import pandas as pd
import numpy as np
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import plotly.plotly as py
from plotly import tools
from datetime import date
import seaborn as sns
import random
import warnings
warnings.filterwarnings("ignore")
init_notebook_mode(connected=True)
import zipfile
import os
import winsound
import sys
sys.path.append('c:\\users\\chait\\anaconda3\\lib\\site-packages')
"""Helper Functions to do Plotting"""
def generateLayoutBar(col_name):
"""
Generate a layout object for bar chart
"""
layout_bar = go.Layout(
autosize=False, # auto size the graph? use False if you are specifying the height and width
width=800, # height of the figure in pixels
height=600, # height of the figure in pixels
title="Distribution of {} column".format(col_name), # title of the figure
# more granular control on the title font
titlefont=dict(
family='Courier New, monospace', # font family
size=14, # size of the font
color='black' # color of the font
),
# granular control on the axes objects
xaxis=dict(
tickfont=dict(
family='Courier New, monospace', # font family
size=14, # size of ticks displayed on the x axis
color='black' # color of the font
)
),
yaxis=dict(
# range=[0,100],
title='Percentage',
titlefont=dict(
size=14,
color='black'
),
tickfont=dict(
family='Courier New, monospace', # font family
size=14, # size of ticks displayed on the y axis
color='black' # color of the font
)
),
font=dict(
family='Courier New, monospace', # font family
color="white", # color of the font
size=12 # size of the font displayed on the bar
)
)
return layout_bar
def plotBar(dataframe_name, col_name, top_n=None):
"""
Plot a bar chart for the categorical columns
Arguments:
dataframe name
categorical column name
Output:
Plot
"""
# create a table with value counts
temp = dataframe_name[col_name].value_counts()
if top_n is not None:
temp = temp.head(top_n)
# creating a Bar chart object of plotly
data = [go.Bar(
x=temp.index.astype(str), # x axis values
y=np.round(temp.values.astype(float) / temp.values.sum(), 4) * 100, # y axis values
text=['{}%'.format(i) for i in np.round(temp.values.astype(float) / temp.values.sum(), 4) * 100],
# text to be displayed on the bar, we are doing this to display the '%' symbol along with the number on the bar
textposition='auto', # specify at which position on the bar the text should appear
marker=dict(color='#0047AB'),)] # change color of the bar
# color used here Cobalt Blue
layout_bar = generateLayoutBar(col_name=col_name)
fig = go.Figure(data=data, layout=layout_bar)
return iplot(fig)
outpath = "../data/"
os.listdir(outpath)
# for i in os.listdir(outpath):
# z = zipfile.ZipFile(outpath + i)
# z.extractall(path = outpath)
# z.close()
for i in os.listdir(outpath):
print(i)
datapath = '../data/Data/'
for i in os.listdir(datapath):
print(i)
Customer_Demographics = pd.read_excel(datapath + 'Customer_Demographics.xlsx')
Customer_Transaction = pd.read_excel(datapath + 'Customer_Transaction.xlsx')
Store_Master = pd.read_excel(datapath + 'Store_Master.xlsx')
Test_Set = pd.read_excel(datapath + 'Test_Set.xlsx')
[i.shape for i in [Customer_Demographics, Customer_Transaction, Store_Master, Test_Set]]
No information about the units given in the data. Assuming Revenue in dhirams.
Customer_Demographics.head()
Customer_Demographics.dtypes
from datetime import datetime
datetime.strptime('Jun 1 2005 1:33PM', '%b %d %Y %I:%M%p')
Customer_Demographics.First_txn_dt[0]
t = datetime.strptime('02NOV2010:00:00:00', '%d%b%Y:%H:%M:%S')
date_cols_demo = [ i for i in Customer_Demographics.columns if (i.endswith('dt')) or ('date' in i)]
date_cols_demo
Customer_Demographics.First_txn_dt.dtype
Customer_Demographics[date_cols_demo] = Customer_Demographics[date_cols_demo].astype('O')
def convertToDate(x):
try:
return datetime.strptime(x, '%d%b%Y:%H:%M:%S')
except:
return x
Customer_Demographics[date_cols_demo] = Customer_Demographics[date_cols_demo].applymap(lambda x : convertToDate(x))
date_cols_demo
def extractColTypes(dataset):
"""This functions extracts numeric, categorical , datetime and boolean column types.
Returns 4 lists with respective column types"""
num_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['int64','float64']]
cat_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['object']]
date_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['datetime64[ns]']]
bool_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['bool']]
print ("Numeric Columns:", len(num_cols_list))
print ("Categorical/Character Columns:", len(cat_cols_list))
print ("Date Columns:",len(date_cols_list))
print ("Boolean Columns:",len(bool_cols_list))
return(num_cols_list,cat_cols_list,date_cols_list,bool_cols_list)
demo_num_cols_list,demo_cat_cols_list,demo_date_cols_list,demo_bool_cols_list = extractColTypes(Customer_Demographics)
demo_cat_cols_list
today = convertToDate('24NOV2018:00:00:00')
def getDaysDiff(x):
t = -1*(x - today)
return t.apply(lambda x : x.days)
Customer_Demographics['custSince'] = getDaysDiff(Customer_Demographics.First_txn_dt)
Customer_Demographics['ds_accr']= getDaysDiff(Customer_Demographics.Last_accr_txn_dt)
Customer_Demographics['ds_rdm']= getDaysDiff(Customer_Demographics.Last_rdm_txn_dt)
for i in demo_cat_cols_list[1:]:
plotBar(Customer_Demographics, i,top_n=10)
Some columns here like the Income and Marital status are unspecified, as of now we cannot decide if they will help in the prediction, but we should keep a close watch for these.
demo_num_cols_list
def plotHist(dataframe, col_name):
data = [go.Histogram(x=dataframe[col_name],
marker=dict(
color='#CC0E1D',# Lava (#CC0E1D)
# color = 'rgb(200,0,0)' # you can provide color in HEX format or rgb format, genrally programmers prefer HEX format as it is a single string value and easy to pass as a variable
))]
layout = go.Layout(title = "Histogram of {}".format(col_name))
fig = go.Figure(data= data, layout=layout)
return iplot(fig)
plotHist(Customer_Demographics, 'Age')
Looks like the age group of 30-40 are frequent visitors of the store, also we see some entries above 100, I will remove them
Customer_Demographics.Age = Customer_Demographics.Age.clip_upper(100)
plotHist(Customer_Demographics, 'Age')
plotHist(Customer_Demographics, 'Points')
Most people have 0-2000 points
Customer_Demographics[Customer_Demographics.Points>2000].shape
Just above 1000 customers have more than 2000 points
Let us see if these customers are distinguishable from the rest
for i in demo_cat_cols_list[1:]:
plotBar(Customer_Demographics[Customer_Demographics.Points>2000], i,top_n=10)
The distribution of all the above columns look very similar to those of the unfiltered data. There is no disctinguishing factor as such.
def plotMultiBox(dataframe,col_name, num_col_name):
data = []
for i in dataframe[col_name].unique():
trace = go.Box(y = dataframe[num_col_name][dataframe[col_name] == i],
name = i)
data.append(trace)
layout = go.Layout(title="Boxplot of levels in {} for {} column".format(col_name,num_col_name))
fig = go.Figure(data=data, layout=layout)
return (iplot(fig))
plotMultiBox(Customer_Demographics, 'Loyalty_Status', 'custSince')
We see that in general, gold customers are with the company longer
plotMultiBox(Customer_Demographics, 'Gender', 'custSince')
Females have been customers for a marginally more number of days than males.
plotMultiBox(Customer_Demographics, 'Income_Range', 'custSince')
Observations:
- An interesting observation here, customers who earn more than 20,000 Dhirams have been with the company for over 2500 days, also there are no recent entries for such high income groups.
- The recent entries are from <20000 dhiram earning group. May be the store now has more affordable goods.
- This could also be a result of the location of the malls
Customer_Transaction.columns
Customer_Transaction.dtypes
Customer_Transaction.head(20)
plotHist(Customer_Transaction,'Revenue')